# Load the dataset-reuse snapshot and model ResolvableScore against
# DatasetType in several ways (ANOVA, Poisson GLM, ordered logit).
# Reference help pages: ?read.csv, ?MASS::polr, library(help = "MASS").
library(MASS)  # provides polr() for the ordered logit model below

# stringsAsFactors is given explicitly so that text columns are read as
# factors on every R version (the default changed to FALSE in R 4.0.0).
a <- read.csv("ReuseDatasetsSnap.csv", stringsAsFactors = TRUE)

# Explicit print() calls so the output also appears when the file is run
# through source(), which does not auto-print top-level values.
print(a)
print(summary(a))

# One-way ANOVA -- kept for the record only. The original author flagged it
# as a mistake ("not categorical vs. continuous"): ResolvableScore is a
# 0-5 score, not a continuous response.
# stats:: is spelled out because the fit is stored under the name `aov`.
aov <- stats::aov(ResolvableScore ~ DatasetType, data = a)
print(aov)
print(summary(aov))

# Companion plot to the ANOVA above; ignore for the same reason.
boxplot(ResolvableScore ~ DatasetType, data = a)

# Poisson log-linear regression of the score on dataset type.
# The family is passed as stats::poisson() rather than the bare name
# `poisson`: the fit is stored in a variable called `poisson`, so on a second
# run in the same session the bare name would resolve to the previous glm
# object and glm() would stop with an invalid-family error.
poisson <- glm(ResolvableScore ~ DatasetType, data = a,
               family = stats::poisson())
print(summary(poisson))

# Ordered logit (proportional odds) model with the score as an ordered factor.
# Hess = TRUE stores the Hessian at fit time so summary() does not have to
# re-fit the model to obtain standard errors.
ordlogit <- polr(as.ordered(ResolvableScore) ~ DatasetType, data = a,
                 Hess = TRUE)
print(summary(ordlogit))
# Cross-tabulations and Pearson chi-squared tests of independence between the
# resolvability / attribution measures and DatasetType, BroaderDatatypes and
# Journal.
#
# Columns are looked up with with(a, ...) instead of attach(a): attach() was
# never paired with detach() and leaves the columns on the search path, where
# they can be masked by (or mask) other objects. Building the table inside
# with() keeps the variable names as dimension labels.
#
# Each table is built once, printed, and then passed to chisq.test(); the
# test result does not depend on which variable forms the rows.
# NOTE(review): chisq.test() may warn "Chi-squared approximation may be
# incorrect" on the sparser tables (the recorded output for
# ResolvableScore x DatasetType shows this warning); interpret those p-values
# with care.

# --- Resolvable vs. DatasetType ---
b <- with(a, table(ResolvableYN, DatasetType))
print(b)
print(chisq.test(b))

cc <- with(a, table(ResolvableScore, DatasetType))
print(cc)
print(chisq.test(cc))  # but why not just chi sq?

# --- Resolvable vs. BroaderDatatypes ---
bb <- with(a, table(ResolvableYN, BroaderDatatypes))
print(bb)
print(chisq.test(bb))

ccc <- with(a, table(ResolvableScore, BroaderDatatypes))
print(ccc)
print(chisq.test(ccc))

# --- Resolvable vs. Journal ---
d <- with(a, table(ResolvableYN, Journal))
print(d)
print(chisq.test(d))

e <- with(a, table(ResolvableScore, Journal))
print(e)
print(chisq.test(e))

# --- Attribution vs. Journal ---
f <- with(a, table(AttributionYN, Journal))
print(f)
print(chisq.test(f))

g <- with(a, table(AttributionScore, Journal))
print(g)
print(chisq.test(g))

# --- Attribution vs. DatasetType ---
hh <- with(a, table(AttributionYN, DatasetType))
print(hh)
print(chisq.test(hh))

i <- with(a, table(AttributionScore, DatasetType))
print(i)
print(chisq.test(i))
#Outputs to send to Heather
##Tables
#> b=table(ResolvableYN,DatasetType);b
#            DatasetType
#ResolvableYN Bio Ea Eco GA GIS GO GS PA PT XY
#           0  22 35  10  5  10  5 33 19  8  5
#           1   0  0   0  0   0  0 17  0  1  0
#> bb=table(ResolvableYN,BroaderDatatypes);bb
#            BroaderDatatypes
#ResolvableYN EA Eco  G  O PT  S
#           0 35  10 43 41  8 15
#           1  0   0 17  0  1  0
#> cc=table(ResolvableScore,DatasetType);cc
#               DatasetType
#ResolvableScore Bio Ea Eco GA GIS GO GS PA PT XY
#              0   6 15   3  0   4  0  3  4  1  3
#              1  13 10   6  2   2  4 12 10  5  0
#              2   1  8   0  2   2  1  6  3  0  0
#              3   0  0   0  0   0  0  4  0  0  0
#              4   2  2   1  1   2  0  8  2  2  2
#              5   0  0   0  0   0  0 17  0  1  0
#> ccc=table(ResolvableScore,BroaderDatatypes);ccc
#             BroaderDatatypes
#ResolvableScore EA Eco  G  O PT  S
#              0 15   3  3 10  1  7
#              1 10   6 18 23  5  2
#              2  8   0  9  4  0  2
#              3  0   0  4  0  0  0
#              4  2   1  9  4  2  4
#              5  0   0 17  0  1  0
##Chi-Squared
#> chisq.test(table  (ResolvableScore,DatasetType))
#        Pearson's Chi-squared test
#data:  table(ResolvableScore, DatasetType)
#X-squared = 98.1825, df = 45, p-value = 7.922e-06
#Warning message:
#In chisq.test(table(ResolvableScore, DatasetType)) :
#  Chi-squared approximation may be incorrect
##Generalized Linear Model - Poisson (alternatives: binomial or zero-inflated model for "ResolvableYN")
#> poisson = glm(ResolvableScore~DatasetType,data=a,family=poisson)
#> summary(poisson)
#Call:
#glm(formula = ResolvableScore ~ DatasetType, family = poisson,
#    data = a)
#Deviance Residuals:
#     Min        1Q    Median        3Q       Max
#-2.47386  -1.37229  -0.04478   0.60369   2.29458
#Coefficients:
#                   Estimate Std. Error z value Pr(>|z|)
#(Intercept)         0.04445    0.20851   0.213   0.8312
#DatasetTypeEa  -0.07344    0.26998  -0.272   0.7856
#DatasetTypeEco -0.04445    0.37878  -0.117   0.9066
#DatasetTypeGA   0.64870    0.37879   1.713   0.0868 .
#DatasetTypeGIS  0.29202    0.33898   0.861   0.3890
#DatasetTypeGO   0.13787    0.45842   0.301   0.7636
#DatasetTypeGS   1.07396    0.22364   4.802 1.57e-06 ***
#DatasetTypePA   0.18916    0.29180   0.648   0.5168
#DatasetTypePT   0.64870    0.31470   2.061   0.0393 *
#DatasetTypeXY   0.42555    0.41042   1.037   0.2998
#---
#Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(Dispersion parameter for poisson family taken to be 1)
#    Null deviance: 283.03  on 169  degrees of freedom
#Residual deviance: 212.36  on 160  degrees of freedom
#AIC: 566.94
#Number of Fisher Scoring iterations: 5
##Ordered Logit Model (library MASS, function polr)
#polr(formula = as.ordered(ResolvableScore) ~ DatasetType, data = a)
#Coefficients:
#                    Value Std. Error    t value
#DatasetTypeEa  -0.1696435  0.4978595 -0.3407457
#DatasetTypeEco -0.1249959  0.6788473 -0.1841296
#DatasetTypeGA   1.3932769  0.8160008  1.7074454
#DatasetTypeGIS  0.2656731  0.7333286  0.3622839
#DatasetTypeGO   0.6023778  0.8112429  0.7425369
#DatasetTypeGS   2.3786340  0.4917395  4.8371833
#DatasetTypePA   0.3831136  0.5557081  0.6894152
#DatasetTypePT   1.0288666  0.7284316  1.4124410
#DatasetTypeXY  -0.2736481  1.1134912 -0.2457569
#Intercepts:
#    Value   Std. Error t value
#0|1 -0.6708  0.3879    -1.7294
#1|2  1.2789  0.4015     3.1855
#2|3  2.0552  0.4237     4.8511
#3|4  2.2097  0.4285     5.1573
#4|5  3.3495  0.4775     7.0144
#Residual Deviance: 483.3118
#AIC: 511.3118